Introduction

The key procedure in external memory inverted index construction is the external memory multiway merge algorithm.


In [20]:
from heapq import heappush, heappop, merge
# documentation: https://docs.python.org/3.4/library/heapq.html
from math import ceil

## the heap is essentially a min-heap

DEBUG = True

def p(msg):
    if DEBUG:
        print('.. {}'.format(msg))

## making use of the heapq.merge() function to simulate the in-memory heap based merging
## (see source at https://fossies.org/dox/Python-3.5.2/heapq_8py_source.html)
def merge_runs(list_of_runs):
    return list(merge(*list_of_runs))

In [29]:
import random

random.seed(1999)

n1 = n2 = 5
n3 = 2

run1 = sorted([random.randint(1, 100) for _ in range(n1)])
run2 = sorted([random.randint(1, 100) for _ in range(n2)])
run3 = sorted([random.randint(1, 100) for _ in range(n3)])

print(run1, run2, run3)

runs = [run1, run2, run3]
new_run = merge_runs(runs)
print(new_run)


[15, 19, 46, 73, 93] [26, 33, 64, 73, 80] [7, 22, 27, 77, 83]
[7, 15, 19, 22, 26, 27, 33, 46, 64, 73, 73, 77, 80, 83, 93]

Exercise

Implement the merge-way merge function multiway_merge(data, k, pagesize), where data is an unordered list of integers, k is the maximum number of runs to merge simultaneously, and pagesize is the maximum number of integers in the initial runs.


In [ ]:
def multiway_merge(data, k, pagesize):
    p('{}-way merge, initial page size = {}'.format(k, pagesize))
    ###
    ### INSERT YOUR OWN CODE
    ###

If you have implemented it correctly, if you run the code below, you should see the following output.


In [31]:
import random

random.seed(1999)
n = 23
data = [random.randint(1, 100) for _ in range(n)]

sorted_data = multiway_merge(data, 3, 2)
print(sorted_data)
print()

sorted_data = multiway_merge(data, 2, 2)
print(sorted_data)


.. 3-way merge, initial page size = 2
.. Initial runs = [[15, 46], [19, 93], [64, 73], [33, 80], [26, 73], [22, 77], [27, 83], [7, 52], [12, 51], [7, 80], [19, 95], [36]]
.. New runs size = 4. Runs = [[15, 19, 46, 64, 73, 93], [22, 26, 33, 73, 77, 80], [7, 12, 27, 51, 52, 83], [7, 19, 36, 80, 95]]
.. New runs size = 2. Runs = [[7, 12, 15, 19, 22, 26, 27, 33, 46, 51, 52, 64, 73, 73, 77, 80, 83, 93], [7, 19, 36, 80, 95]]
.. New runs size = 1. Runs = [[7, 7, 12, 15, 19, 19, 22, 26, 27, 33, 36, 46, 51, 52, 64, 73, 73, 77, 80, 80, 83, 93, 95]]
[7, 7, 12, 15, 19, 19, 22, 26, 27, 33, 36, 46, 51, 52, 64, 73, 73, 77, 80, 80, 83, 93, 95]

.. 2-way merge, initial page size = 2
.. Initial runs = [[15, 46], [19, 93], [64, 73], [33, 80], [26, 73], [22, 77], [27, 83], [7, 52], [12, 51], [7, 80], [19, 95], [36]]
.. New runs size = 6. Runs = [[15, 19, 46, 93], [33, 64, 73, 80], [22, 26, 73, 77], [7, 27, 52, 83], [7, 12, 51, 80], [19, 36, 95]]
.. New runs size = 3. Runs = [[15, 19, 33, 46, 64, 73, 80, 93], [7, 22, 26, 27, 52, 73, 77, 83], [7, 12, 19, 36, 51, 80, 95]]
.. New runs size = 2. Runs = [[7, 15, 19, 22, 26, 27, 33, 46, 52, 64, 73, 73, 77, 80, 83, 93], [7, 12, 19, 36, 51, 80, 95]]
.. New runs size = 1. Runs = [[7, 7, 12, 15, 19, 19, 22, 26, 27, 33, 36, 46, 51, 52, 64, 73, 73, 77, 80, 80, 83, 93, 95]]
[7, 7, 12, 15, 19, 19, 22, 26, 27, 33, 36, 46, 51, 52, 64, 73, 73, 77, 80, 80, 83, 93, 95]

In [ ]:


In [ ]:


In [ ]: